package org.totoro;

import cn.hutool.core.io.FileUtil;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.util.regex.Pattern.compile;

/**
 * Command-line entry point that reads a vocabulary text file, splits it into
 * blocks at numbered root headers, and extracts fields from each line with
 * regular expressions.
 *
 * <p>All helpers are stateless; the regular expressions are compiled once and
 * shared as constants.
 */
public class App {

    /** Numbered root header that opens a block, e.g. {@code 12. -ab(s)-}. */
    private static final Pattern SECTION_HEADER =
            Pattern.compile("\\d{1,}\\.\\s*-\\w*\\(?\\w{0,}\\)?-\\s*");

    /**
     * A run of ASCII letters with an optional single hyphen, ending in at least
     * one lower-case letter and followed by whitespace. The whitespace is only
     * looked at, not captured.
     */
    private static final Pattern WORD = Pattern.compile("[a-zA-Z]+-?[a-z]+(?=\\s)");

    /** First bracketed group on a line: {@code [} up to the nearest {@code ]}. */
    private static final Pattern PHONETIC = Pattern.compile("\\[[^\\]]*\\]");

    /** Optional ASCII letters, a dot, then the rest of the line (e.g. {@code v. ...}). */
    private static final Pattern TRANSLATION = Pattern.compile("[a-zA-Z]*\\..*");

    /** Marker that opens the root-statistics section. */
    private static final Pattern ROOT_SECTION_START = Pattern.compile("\\s{0,}词根统计\\s{0,}");

    /** Marker that closes the root-statistics section. */
    private static final Pattern ROOT_SECTION_END = Pattern.compile("\\s{0,}总\\s{0,}计");

    /**
     * Line filter for the root-statistics section. Because the digits are
     * optional, this accepts any line that contains a dot.
     */
    private static final Pattern ROOT_LINE_FILTER = Pattern.compile("\\d{0,}\\.\\s{0,}");

    /** A root written as {@code -xx(yy)-} or {@code -xx-}. */
    private static final Pattern ROOT =
            Pattern.compile("-\\s{0,}\\w{0,}\\(\\w{0,}\\)-|-\\w{0,}-");

    /**
     * Explanation text: starts with a CJK character and ends with a CJK character
     * or a literal dollar sign.
     * NOTE(review): inside a character class {@code $} is a literal, not an
     * end-of-line anchor - confirm that this is what was intended.
     */
    private static final Pattern ROOT_EXPLANATION =
            Pattern.compile("[\\u4e00-\\u9fa5]{1,}.{0,}[$\\u4e00-\\u9fa5]");

    /**
     * Reads {@code word_copy.txt} through hutool's {@code FileUtil.readString},
     * splits it into blocks and runs the word and phonetic extractors over every
     * non-blank line.
     *
     * @param args unused
     * @throws RuntimeException if a non-blank line has no word or no phonetic
     *                          transcription
     */
    public static void main(String[] args) {
        String wordStr = FileUtil.readString("word_copy.txt", "utf-8");

        for (String block : getSplitText(wordStr)) {
            // "\r?\n" keeps a stray carriage return out of the lines on CRLF input.
            for (String line : block.split("\\r?\\n")) {
                if (line.trim().isEmpty()) {
                    // A blank line carries no entry; do not fail on it.
                    continue;
                }
                // The extracted values are not consumed yet; the calls still
                // validate the line because they throw when nothing matches.
                getWord(line);
                getPhoneticAlphabet(line);
            }
        }
    }

    /**
     * Extracts the first word on the line.
     *
     * @param line a single line of text
     * @return the matched word, without the whitespace that follows it
     * @throws RuntimeException if the line contains no word followed by whitespace
     */
    private static String getWord(String line) {
        Matcher matcher = WORD.matcher(line);
        if (!matcher.find()) {
            throw new RuntimeException("未匹配到单词" + line);
        }
        return matcher.group();
    }

    /**
     * Extracts the phonetic transcription, i.e. the first {@code [...]} group.
     *
     * @param line a single line of text
     * @return the bracketed text, brackets included
     * @throws RuntimeException if the line contains no bracketed group
     */
    private static String getPhoneticAlphabet(String line) {
        Matcher matcher = PHONETIC.matcher(line);
        if (!matcher.find()) {
            throw new RuntimeException("未匹配到 音标" + line);
        }
        return matcher.group();
    }

    /**
     * Extracts the translation part of the line: everything from the first
     * "letters followed by a dot" position to the end of the line.
     *
     * @param line a single line of text
     * @return the matched text
     * @throws RuntimeException if the line contains no dot
     */
    private static String getTranslate(String line) {
        Matcher matcher = TRANSLATION.matcher(line);
        if (!matcher.find()) {
            throw new RuntimeException("未匹配到中文 翻译" + line);
        }
        return matcher.group();
    }

    /**
     * Splits the text into blocks, each starting at a root header and running up
     * to the next header (or to the end of the text for the last block).
     *
     * <p>Text in front of the first header is discarded. When the text contains
     * no header at all it is returned as one block, or as no block if it is empty.
     *
     * @param wordStr the full text to split
     * @return the blocks in document order
     */
    private static ArrayList<String> getSplitText(String wordStr) {
        ArrayList<String> blocks = new ArrayList<>();
        Matcher header = SECTION_HEADER.matcher(wordStr);

        // Start index of the block currently being collected; -1 until the first header.
        int blockStart = -1;
        while (header.find()) {
            if (blockStart >= 0) {
                blocks.add(wordStr.substring(blockStart, header.start()));
            }
            blockStart = header.start();
        }

        if (blockStart >= 0) {
            // substring(begin) runs to the very end, so the last character is kept.
            blocks.add(wordStr.substring(blockStart));
        } else if (!wordStr.isEmpty()) {
            blocks.add(wordStr);
        }
        return blocks;
    }

    /**
     * Parses the root-statistics section, i.e. the text between the start marker
     * and the first end marker that follows it.
     *
     * @param wordStr the full text
     * @return one {@code WordRoot} per accepted line, in document order; empty
     *         when either marker is missing
     */
    private static ArrayList<WordRoot> getRootWord(String wordStr) {
        ArrayList<WordRoot> wordRoots = new ArrayList<>();

        Matcher startMatcher = ROOT_SECTION_START.matcher(wordStr);
        if (!startMatcher.find()) {
            return wordRoots;
        }

        // Look for the end marker only behind the start marker, so the section
        // bounds can never be inverted.
        Matcher endMatcher = ROOT_SECTION_END.matcher(wordStr);
        if (!endMatcher.find(startMatcher.end())) {
            return wordRoots;
        }

        String section = wordStr.substring(startMatcher.start(), endMatcher.start());

        for (String content : section.split("\n")) {
            // Skip lines that do not pass the filter.
            if (!ROOT_LINE_FILTER.matcher(content).find()) {
                continue;
            }

            WordRoot wordRoot = new WordRoot();
            wordRoot.setContent(content);

            // Collect every root on the line.
            ArrayList<String> roots = new ArrayList<>();
            Matcher rootMatcher = ROOT.matcher(content);
            while (rootMatcher.find()) {
                roots.add(rootMatcher.group());
            }
            wordRoot.setRoots(roots);

            // The explanation is optional.
            Matcher explanationMatcher = ROOT_EXPLANATION.matcher(content);
            if (explanationMatcher.find()) {
                wordRoot.setRootExplanation(explanationMatcher.group());
            }

            wordRoots.add(wordRoot);
        }
        return wordRoots;
    }
}
